Project authors and participants:

Don't forget to upvote if you liked the post!

1 Problem Definition

Each row contains the text of a tweet and a sentiment label. In the training set you are provided with a word or phrase drawn from the tweet (selected_text) that encapsulates the provided sentiment.


Source: https://www.nablustv.net/Uploads/Image/138264199060853557824.jpg

Premise:

Objective: predict the word or phrase from the tweet that exemplifies the provided sentiment

Load dependencies:

library(tidymodels) # tidy machine learning
library(readr)      # read/write
library(dplyr)      # manipulate data
library(tidyr)      # tidy data
library(purrr)      # functional programming
library(stringr)    # text manipulation
library(qdapRegex)  # easy regex
library(tm)         # text mining
library(tidytext)   # text mining
library(ggplot2)    # elegant graphs
library(patchwork)  # grid ggplot
library(doParallel) # parallel process
library(foreach)    # parallel process

theme_set(theme_bw()) # set theme

# Install external package from the local Kaggle dataset if it is not already
# available. NOTE: use requireNamespace() for the availability check —
# `require(x) == T` is fragile (`T` can be reassigned, and require() already
# attaches the package as a side effect of the check).
if (requireNamespace("textfeatures", quietly = TRUE)) {
  library(textfeatures)
} else {
  devtools::install("../input/r-textfeatures-package/textfeatures/")
  library(textfeatures)
}

ncores <- 4 # cores used for parallel training below

Function to evaluate model performance:

jaccard <- function(str1, str2) {
  # R port of the competition metric:
  # https://www.kaggle.com/c/tweet-sentiment-extraction/overview/evaluation
  # The reference implementation operates on *sets* of lowercase words, so
  # duplicated words must not be counted twice -> unique() both sides.
  # Also guard the denominator: two empty token vectors would divide by zero.
  a <- unique(unlist(strsplit(tolower(str1), split = " ", fixed = TRUE)))
  b <- unique(unlist(strsplit(tolower(str2), split = " ", fixed = TRUE)))
  common <- intersect(a, b)
  denom <- length(a) + length(b) - length(common)
  if (denom == 0) {
    return(0) # both strings tokenless: define similarity as 0
  }
  length(common) / denom
}

2 Available data

Load available training and test data:

# Load competition data; rename the target column to a shorter name.
# NOTE(review): sample_n(1000) keeps only 1000 random training rows —
# presumably to cap kernel runtime; remove it for a full training run.
train_data <- read_csv("../data/train.csv") %>% rename(sel_text = selected_text) %>% sample_n(1000)
test_data <- read_csv("../data/test.csv")

Remove missing:

# Remove rows with missing or empty tweet text.
# The original condition `!is.na(text) | text == ""` dropped NAs but KEPT
# empty strings (TRUE | TRUE); the stated intent ("Remove missing") is to
# drop both, hence the conjunction below.
train_data <- train_data %>% filter(!is.na(text), text != "")

Check Jaccard by sentiment using full text:

# Mean Jaccard per sentiment when the whole tweet is used as the prediction.
train_data %>%
  mutate(jaccard = map2_dbl(text, sel_text, jaccard)) %>%
  group_by(sentiment) %>%
  summarise(jaccard = mean(jaccard))
FALSE # A tibble: 3 x 2
FALSE   sentiment jaccard
FALSE   <chr>       <dbl>
FALSE 1 negative    0.327
FALSE 2 neutral     0.914
FALSE 3 positive    0.305

Note that the Jaccard of the neutral sentiment is quite high when selecting the full text. So, let’s set aside all the neutral texts before modeling.

# Neutral tweets will simply be answered with their full text,
# so hold them out of the modeling data.
train_neutral <- filter(train_data, sentiment == "neutral")
train_data    <- filter(train_data, sentiment != "neutral")
test_neutral  <- filter(test_data, sentiment == "neutral")
test_data     <- filter(test_data, sentiment != "neutral")

Remove lines from the training data where sel_text is not contained in the text

# Flag rows whose selected text shares no whitespace-separated word with the
# tweet text (these labels cannot be recovered by any ngram candidate).
bad_text <- map2_lgl(
  str_split(train_data$text, " "),
  str_split(train_data$sel_text, " "),
  ~ !any(.x %in% .y)
)

train_data <- train_data[!bad_text, ]

25 lines have been removed.

2.1 N-Gram

We will collect all possible ngrams to train a regression model that estimates a jaccard for each piece.

Ngrams are like this:


Source: https://deepai.org/machine-learning-glossary-and-terms/n-gram

code:

# Escape regex metacharacters in `x` so it can be used as a literal pattern
# (e.g. in str_remove() below).
# Fixes vs the original:
#  * `\\.{1,}` escaped a whole run of dots with a single backslash, turning
#    "..." into the wrong pattern "\..." instead of "\.\.\.";
#  * stringr's ICU [:punct:] excludes symbol characters (`$ + < = > ^ | ~`),
#    which is why ad-hoc alternatives were bolted on — and some were missed.
# Base gsub() uses the POSIX class, which covers all ASCII punctuation, and
# escapes each character individually via the backreference.
to_search <- function(x){
  gsub("([[:punct:]])", "\\\\\\1", x)
}

# train: expand each tweet into all contiguous word n-grams (n = 1..n_words).
# Each n-gram becomes one candidate row; `sel` marks the candidate equal to
# the target, and `dif_text` is the tweet with the candidate removed.
train_ngrams <- 
  train_data %>% 
  # words per tweet (split on single spaces)
  mutate(n_words = map_dbl(text, ~str_split(.x, pattern = " ", )[[1]] %>% length())) %>% 
  mutate(ngram_text = map2(text, n_words,  function(text, n_words){
    map(1:n_words, 
        # tau::textcnt enumerates the distinct n-grams of length .x
        ~ tau::textcnt(text, method = "string", split = " ", n = .x, tolower = FALSE) %>% names() %>% unlist()
    ) } )) %>% 
  mutate(ngram_text = map(ngram_text, unlist)) %>% 
  # one row per (tweet, candidate n-gram)
  unnest(cols = c(ngram_text)) %>% 
  # does this candidate exactly equal the labeled selected text?
  mutate(sel = ngram_text == sel_text) %>% 
  # remaining text after deleting the candidate (to_search escapes regex chars)
  mutate(dif_text = str_remove(text, to_search(ngram_text)))

# test: same n-gram expansion as for train, minus the label-dependent columns
# (no sel_text exists for the test set).
test_ngrams <- 
  test_data %>% 
  # words per tweet (split on single spaces)
  mutate(n_words = map_dbl(text, ~str_split(.x, pattern = " ", )[[1]] %>% length())) %>% 
  mutate(ngram_text = map2(text, n_words,  function(text, n_words){
    map(1:n_words, 
        # tau::textcnt enumerates the distinct n-grams of length .x
        ~ tau::textcnt(text, method = "string", split = " ", n = .x, tolower = FALSE) %>% names() %>% unlist()
    ) } )) %>% 
  mutate(ngram_text = map(ngram_text, unlist)) %>% 
  # one row per (tweet, candidate n-gram)
  unnest(cols = c(ngram_text)) %>% 
  # remaining text after deleting the candidate
  mutate(dif_text = str_remove(text, to_search(ngram_text)))

Calculate the Jaccard similarity between each ngram and sel_text (the target):

# Regression target: Jaccard between each candidate ngram and the true answer.
train_ngrams <- train_ngrams %>%
  mutate(jaccard = map2_dbl(sel_text, ngram_text, jaccard))

Now let’s remove more bad rows: tweets where none of the generated ngrams exactly matches the selected text:

# Drop tweets for which no candidate ngram equals the selected text:
# those rows can never reach jaccard = 1 and only add label noise.
to_remove <-
  train_ngrams %>%
  group_by(textID) %>%
  summarise(has_match = any(ngram_text == sel_text)) %>%
  filter(!has_match) %>%
  pull(textID)
train_ngrams <- train_ngrams %>% filter(!textID %in% to_remove)

111 rows are removed

3 Exploratory Analysis

In this step, pipelines will be developed to collect metadata and parse the columns text, ngram_text and dif_text.


Source: https://innovation.alteryx.com/natural-language-processing-featuretools/

3.1 Metadata

Let’s fit the regression model with tabular information about the text.

We developed a function that collects the metadata of each text / ngram / dif text:

# Collect tabular metadata features for a character vector of texts.
#
# Combines `textfeatures::textfeatures()` output with extra hand-rolled counts
# (vowels, consonants, negations, mentions, questions, retweets, ...).
#
# @param x       character vector of texts (full text / ngram / dif_text)
# @param verbose print progress messages and elapsed time?
# @return a tibble of numeric features, one row per element of `x`
get_metadata <- function(x, verbose = FALSE){
  
  if (isTRUE(verbose)) {
    t0 <- Sys.time() # start time, used to report elapsed minutes at the end
    cat("Getting metadata, please wait ..\n")  
  }
  
  # get metadata with `textfeatures` (no normalization, no word embeddings)
  metadata <- textfeatures::textfeatures(x, normalize = FALSE, word_dims = 0, verbose = verbose)
  
  # discard the package's default n_words and n_uq_words columns
  metadata <- metadata %>% select(-n_words, -n_uq_words)
  
  # feature ideas for later: how many ngrams are possible?
  # which ngram comes before / after this one?
  
  # Extra regex-based counts. (A slow rowwise() recomputation of
  # n_words/n_uq_words used to sit here, but transmute() discarded those
  # columns anyway — dead code removed.)
  metadata <- 
    tibble(text = x) %>% 
    transmute(
      n_vogals = str_count(str_to_lower(text), "[aeiou]"),
      n_consonants = str_count(str_to_lower(text), "[bcdfghjklmnpqrstvwxyz]"),
      n_str = str_length(text),
      # n_upper = str_count(text, "[A-Z]"), # n_caps
      n_neg = str_count(str_to_lower(text), "(\\bno+\\b|\\bnor+\\b|\\bnot+\\b|n\\'t\\b)"), # negations
      n_atpeople = str_count(text, "@\\w+"),
      n_question = str_count(text, "\\?+"),
      # n_dot = str_count(text, "\\.+"), # n_period
      n_retweet = str_count(text, "(RT|via)((?:\\b\\W*@\\w+)+)")
    ) %>% 
    bind_cols(metadata)
  
  # merge singular + plural person counts into single columns
  metadata <- metadata %>% 
    mutate(n_first_person = n_first_person + n_first_personp,
           n_second_person = n_second_person + n_second_personp) %>% 
    select(-n_first_personp, -n_second_personp)
  
  if (isTRUE(verbose)) {
    cat(paste0("Metadata successfully obtained!\nThe process took: ",
               round(difftime(Sys.time(), t0, units = "mins")), " min\n"))
  }  
  
  metadata
}

get metadata from train:

# Text-level metadata for each training tweet, columns prefixed with "text_".
text_metadata <- get_metadata(train_data$text, verbose = TRUE) %>%
  rename_with(~ paste0("text_", .x))
text_metadata <- bind_cols(tibble(textID = train_data$textID), text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 0 min
# Candidate-ngram metadata (one row per ngram), columns prefixed "sel_text_".
sel_text_metadata <- get_metadata(train_ngrams$ngram_text, verbose = TRUE) %>%
  rename_with(~ paste0("sel_text_", .x))
sel_text_metadata <- bind_cols(tibble(textID = train_ngrams$textID), sel_text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 3 min
# Metadata of the text remaining after ngram removal, prefixed "dif_text_".
dif_text_metadata <- get_metadata(train_ngrams$dif_text, verbose = TRUE) %>%
  rename_with(~ paste0("dif_text_", .x))
dif_text_metadata <- bind_cols(tibble(textID = train_ngrams$textID), dif_text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 3 min
# join all in metadata:
# sel_text_metadata and dif_text_metadata share row order with train_ngrams
# (one row per candidate), so they can be column-bound; tweet-level columns
# are then attached by textID (fan-out join: one tweet -> many candidates).
train_metadata <- 
  left_join(
    bind_cols(sel_text_metadata, select(dif_text_metadata, -textID)),
    bind_cols(train_data, select(text_metadata, -textID)),
    by = "textID"
  ) %>% 
  bind_cols(select(train_ngrams, ngram_text, dif_text, jaccard, n_words)) %>% 
  select(textID, text, sel_text, ngram_text, dif_text, sentiment, n_words, jaccard, everything())
# cache to disk so the expensive metadata step can be skipped on reruns
saveRDS(train_metadata, "../data/train_metadata.rds")
train_metadata
FALSE # A tibble: 52,911 x 119
FALSE    textID text  sel_text ngram_text dif_text sentiment n_words jaccard
FALSE    <chr>  <chr> <chr>    <chr>      <chr>    <chr>       <dbl>   <dbl>
FALSE  1 ff31a… Got … I knew … Got        " to go… positive       17   0    
FALSE  2 ff31a… Got … I knew … I          "Got to… positive       17   0.125
FALSE  3 ff31a… Got … I knew … Tesco's.   "Got to… positive       17   0    
FALSE  4 ff31a… Got … I knew … day        "Got to… positive       17   0.125
FALSE  5 ff31a… Got … I knew … down       "Got to… positive       17   0    
FALSE  6 ff31a… Got … I knew … go         "Got to… positive       17   0    
FALSE  7 ff31a… Got … I knew … going      "Got to… positive       17   0.125
FALSE  8 ff31a… Got … I knew … knew       "Got to… positive       17   0.125
FALSE  9 ff31a… Got … I knew … my         "Got to… positive       17   0.125
FALSE 10 ff31a… Got … I knew … shopping   "Got to… positive       17   0    
FALSE # … with 52,901 more rows, and 111 more variables: sel_text_n_vogals <int>,
FALSE #   sel_text_n_consonants <int>, sel_text_n_str <int>, sel_text_n_neg <int>,
FALSE #   sel_text_n_atpeople <int>, sel_text_n_question <int>,
FALSE #   sel_text_n_retweet <int>, sel_text_n_urls <int>, sel_text_n_uq_urls <int>,
FALSE #   sel_text_n_hashtags <int>, sel_text_n_uq_hashtags <int>,
FALSE #   sel_text_n_mentions <int>, sel_text_n_uq_mentions <int>,
FALSE #   sel_text_n_chars <int>, sel_text_n_uq_chars <int>, sel_text_n_commas <int>,
FALSE #   sel_text_n_digits <int>, sel_text_n_exclaims <int>,
FALSE #   sel_text_n_extraspaces <int>, sel_text_n_lowers <int>,
FALSE #   sel_text_n_lowersp <dbl>, sel_text_n_periods <int>, sel_text_n_caps <int>,
FALSE #   sel_text_n_nonasciis <int>, sel_text_n_puncts <int>,
FALSE #   sel_text_n_capsp <dbl>, sel_text_n_charsperword <dbl>,
FALSE #   sel_text_sent_afinn <dbl>, sel_text_sent_bing <dbl>,
FALSE #   sel_text_sent_syuzhet <dbl>, sel_text_sent_vader <dbl>,
FALSE #   sel_text_n_polite <dbl>, sel_text_n_first_person <int>,
FALSE #   sel_text_n_second_person <int>, sel_text_n_third_person <int>,
FALSE #   sel_text_n_tobe <int>, sel_text_n_prepositions <int>,
FALSE #   dif_text_n_vogals <int>, dif_text_n_consonants <int>, dif_text_n_str <int>,
FALSE #   dif_text_n_neg <int>, dif_text_n_atpeople <int>, dif_text_n_question <int>,
FALSE #   dif_text_n_retweet <int>, dif_text_n_urls <int>, dif_text_n_uq_urls <int>,
FALSE #   dif_text_n_hashtags <int>, dif_text_n_uq_hashtags <int>,
FALSE #   dif_text_n_mentions <int>, dif_text_n_uq_mentions <int>,
FALSE #   dif_text_n_chars <int>, dif_text_n_uq_chars <int>, dif_text_n_commas <int>,
FALSE #   dif_text_n_digits <int>, dif_text_n_exclaims <int>,
FALSE #   dif_text_n_extraspaces <int>, dif_text_n_lowers <int>,
FALSE #   dif_text_n_lowersp <dbl>, dif_text_n_periods <int>, dif_text_n_caps <int>,
FALSE #   dif_text_n_nonasciis <int>, dif_text_n_puncts <int>,
FALSE #   dif_text_n_capsp <dbl>, dif_text_n_charsperword <dbl>,
FALSE #   dif_text_sent_afinn <dbl>, dif_text_sent_bing <dbl>,
FALSE #   dif_text_sent_syuzhet <dbl>, dif_text_sent_vader <dbl>,
FALSE #   dif_text_n_polite <dbl>, dif_text_n_first_person <int>,
FALSE #   dif_text_n_second_person <int>, dif_text_n_third_person <int>,
FALSE #   dif_text_n_tobe <int>, dif_text_n_prepositions <int>, text_n_vogals <int>,
FALSE #   text_n_consonants <int>, text_n_str <int>, text_n_neg <int>,
FALSE #   text_n_atpeople <int>, text_n_question <int>, text_n_retweet <int>,
FALSE #   text_n_urls <int>, text_n_uq_urls <int>, text_n_hashtags <int>,
FALSE #   text_n_uq_hashtags <int>, text_n_mentions <int>, text_n_uq_mentions <int>,
FALSE #   text_n_chars <int>, text_n_uq_chars <int>, text_n_commas <int>,
FALSE #   text_n_digits <int>, text_n_exclaims <int>, text_n_extraspaces <int>,
FALSE #   text_n_lowers <int>, text_n_lowersp <dbl>, text_n_periods <int>,
FALSE #   text_n_caps <int>, text_n_nonasciis <int>, text_n_puncts <int>,
FALSE #   text_n_capsp <dbl>, …

get metadata from test:

# Text-level metadata for each test tweet, columns prefixed with "text_".
text_metadata <- get_metadata(test_data$text, verbose = TRUE) %>%
  rename_with(~ paste0("text_", .x))
text_metadata <- bind_cols(tibble(textID = test_data$textID), text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 0 min
# Candidate-ngram metadata for the test set, prefixed "sel_text_" to match
# the training column names expected by the model.
sel_text_metadata <- get_metadata(test_ngrams$ngram_text, verbose = TRUE) %>%
  rename_with(~ paste0("sel_text_", .x))
sel_text_metadata <- bind_cols(tibble(textID = test_ngrams$textID), sel_text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 14 min
# Metadata of the remaining test text after ngram removal, prefixed "dif_text_".
dif_text_metadata <- get_metadata(test_ngrams$dif_text, verbose = TRUE) %>%
  rename_with(~ paste0("dif_text_", .x))
dif_text_metadata <- bind_cols(tibble(textID = test_ngrams$textID), dif_text_metadata)
FALSE Getting metadata, please wait ..
FALSE ↪ Counting features in text...
FALSE ↪ Sentiment analysis...
FALSE ↪ Parts of speech...
FALSE ↪ Word dimensions started
FALSE ✔ Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 15 min
# join all in metadata (same layout as train_metadata, minus the label
# columns sel_text / jaccard, which do not exist for the test set).
test_metadata <- 
  left_join(
    bind_cols(sel_text_metadata, select(dif_text_metadata, -textID)),
    bind_cols(test_data, select(text_metadata, -textID)),
    by = "textID"
  ) %>% 
  bind_cols(select(test_ngrams, ngram_text, dif_text, n_words)) %>% 
  select(textID, text, ngram_text, dif_text, sentiment, n_words, everything())
# cache to disk so the expensive metadata step can be skipped on reruns
saveRDS(test_metadata, "../data/test_metadata.rds")
test_metadata
FALSE # A tibble: 237,605 x 117
FALSE    textID text  ngram_text dif_text sentiment n_words sel_text_n_voga…
FALSE    <chr>  <chr> <chr>      <chr>    <chr>       <dbl>            <int>
FALSE  1 11aa4… http… -          "http:/… positive       13                0
FALSE  2 11aa4… http… Malta      "http:/… positive       13                2
FALSE  3 11aa4… http… but        "http:/… positive       13                1
FALSE  4 11aa4… http… calling    "http:/… positive       13                2
FALSE  5 11aa4… http… can't      "http:/… positive       13                1
FALSE  6 11aa4… http… from       "http:/… positive       13                1
FALSE  7 11aa4… http… http://tw… " - i w… positive       13                3
FALSE  8 11aa4… http… i          "http:/… positive       13                1
FALSE  9 11aa4… http… was        "http:/… positive       13                1
FALSE 10 11aa4… http… wish       "http:/… positive       13                1
FALSE # … with 237,595 more rows, and 110 more variables:
FALSE #   sel_text_n_consonants <int>, sel_text_n_str <int>, sel_text_n_neg <int>,
FALSE #   sel_text_n_atpeople <int>, sel_text_n_question <int>,
FALSE #   sel_text_n_retweet <int>, sel_text_n_urls <int>, sel_text_n_uq_urls <int>,
FALSE #   sel_text_n_hashtags <int>, sel_text_n_uq_hashtags <int>,
FALSE #   sel_text_n_mentions <int>, sel_text_n_uq_mentions <int>,
FALSE #   sel_text_n_chars <int>, sel_text_n_uq_chars <int>, sel_text_n_commas <int>,
FALSE #   sel_text_n_digits <int>, sel_text_n_exclaims <int>,
FALSE #   sel_text_n_extraspaces <int>, sel_text_n_lowers <int>,
FALSE #   sel_text_n_lowersp <dbl>, sel_text_n_periods <int>, sel_text_n_caps <int>,
FALSE #   sel_text_n_nonasciis <int>, sel_text_n_puncts <int>,
FALSE #   sel_text_n_capsp <dbl>, sel_text_n_charsperword <dbl>,
FALSE #   sel_text_sent_afinn <dbl>, sel_text_sent_bing <dbl>,
FALSE #   sel_text_sent_syuzhet <dbl>, sel_text_sent_vader <dbl>,
FALSE #   sel_text_n_polite <dbl>, sel_text_n_first_person <int>,
FALSE #   sel_text_n_second_person <int>, sel_text_n_third_person <int>,
FALSE #   sel_text_n_tobe <int>, sel_text_n_prepositions <int>,
FALSE #   dif_text_n_vogals <int>, dif_text_n_consonants <int>, dif_text_n_str <int>,
FALSE #   dif_text_n_neg <int>, dif_text_n_atpeople <int>, dif_text_n_question <int>,
FALSE #   dif_text_n_retweet <int>, dif_text_n_urls <int>, dif_text_n_uq_urls <int>,
FALSE #   dif_text_n_hashtags <int>, dif_text_n_uq_hashtags <int>,
FALSE #   dif_text_n_mentions <int>, dif_text_n_uq_mentions <int>,
FALSE #   dif_text_n_chars <int>, dif_text_n_uq_chars <int>, dif_text_n_commas <int>,
FALSE #   dif_text_n_digits <int>, dif_text_n_exclaims <int>,
FALSE #   dif_text_n_extraspaces <int>, dif_text_n_lowers <int>,
FALSE #   dif_text_n_lowersp <dbl>, dif_text_n_periods <int>, dif_text_n_caps <int>,
FALSE #   dif_text_n_nonasciis <int>, dif_text_n_puncts <int>,
FALSE #   dif_text_n_capsp <dbl>, dif_text_n_charsperword <dbl>,
FALSE #   dif_text_sent_afinn <dbl>, dif_text_sent_bing <dbl>,
FALSE #   dif_text_sent_syuzhet <dbl>, dif_text_sent_vader <dbl>,
FALSE #   dif_text_n_polite <dbl>, dif_text_n_first_person <int>,
FALSE #   dif_text_n_second_person <int>, dif_text_n_third_person <int>,
FALSE #   dif_text_n_tobe <int>, dif_text_n_prepositions <int>, text_n_vogals <int>,
FALSE #   text_n_consonants <int>, text_n_str <int>, text_n_neg <int>,
FALSE #   text_n_atpeople <int>, text_n_question <int>, text_n_retweet <int>,
FALSE #   text_n_urls <int>, text_n_uq_urls <int>, text_n_hashtags <int>,
FALSE #   text_n_uq_hashtags <int>, text_n_mentions <int>, text_n_uq_mentions <int>,
FALSE #   text_n_chars <int>, text_n_uq_chars <int>, text_n_commas <int>,
FALSE #   text_n_digits <int>, text_n_exclaims <int>, text_n_extraspaces <int>,
FALSE #   text_n_lowers <int>, text_n_lowersp <dbl>, text_n_periods <int>,
FALSE #   text_n_caps <int>, text_n_nonasciis <int>, text_n_puncts <int>,
FALSE #   text_n_capsp <dbl>, text_n_charsperword <dbl>, …

3.2 Parse

Function developed to calculate statistics for each ngram in relation to the entire text and for each one that remains after removing the ngrams in relation to the entire text

# Derive model-ready predictors from the raw metadata table:
#   * sd_*   : sentiment/politeness difference between the full text and the
#              piece (ngram or remainder)
#   * prop_* : the piece's counts as a proportion of the full-text counts
#              (0 whenever the full-text count is 0, to avoid division by 0)
#   * cat_*  : yes/no indicators for sparse count features
#
# @param metadata tibble produced by joining get_metadata() outputs
# @param test     FALSE -> keep label columns (sel_text, jaccard) in the
#                 output ordering; TRUE -> ordering for the unlabeled test set
# @return the augmented, reordered tibble
parse_metadata <- function(metadata, test = F){
  
metadata <-
  metadata %>% 
    mutate(
      # text stats
      text_n_words = n_words,
      # text_n_lowersp,
      # text_n_capsp,
      # text_n_charsperword,
      # sel_text stats        
      sel_text_n_words = map_dbl(ngram_text, ~length(str_split(.x, pattern = " ")[[1]])),
      # sel_text_n_lowersp,
      # sel_text_n_capsp,
      # sel_text_n_charsperword,
      # interaction sel_text x text
      sd_sel_text_sent_afinn        = text_sent_afinn - sel_text_sent_afinn,
      sd_sel_text_sent_bing         = text_sent_bing - sel_text_sent_bing,
      sd_sel_text_sent_syuzhet      = text_sent_syuzhet - sel_text_sent_syuzhet,
      sd_sel_text_sent_vader        = text_sent_vader - sel_text_sent_vader,
      sd_sel_text_n_polite          = text_n_polite - sel_text_n_polite,
      prop_sel_text_n_vogals        = if_else(text_n_vogals == 0, 0, sel_text_n_vogals / text_n_vogals),
      prop_sel_text_n_consonants    = if_else(text_n_consonants == 0, 0, sel_text_n_consonants / text_n_consonants),
      prop_sel_text_n_str           = if_else(text_n_str == 0, 0, sel_text_n_str / text_n_str),
      # NOTE(review): this ratio is text/ngram, the inverse of the
      # dif_text version below — confirm which orientation was intended
      prop_sel_text_len             = text_n_words / sel_text_n_words,
      prop_sel_text_n_chars         = if_else(text_n_chars == 0, 0, sel_text_n_chars / text_n_chars),
      prop_sel_text_n_uq_chars      = if_else(text_n_uq_chars == 0, 0, sel_text_n_uq_chars / text_n_uq_chars),
      prop_sel_text_n_lowers        = if_else(text_n_lowers == 0, 0, sel_text_n_lowers / text_n_lowers),
      prop_sel_text_n_caps          = if_else(text_n_caps == 0, 0, sel_text_n_caps / text_n_caps),
      prop_sel_text_n_periods       = if_else(text_n_periods == 0, 0, sel_text_n_periods / text_n_periods),
      prop_sel_text_n_commas        = if_else(text_n_commas == 0, 0, sel_text_n_commas / text_n_commas),
      prop_sel_text_n_exclaims      = if_else(text_n_exclaims == 0, 0, sel_text_n_exclaims / text_n_exclaims),
      prop_sel_text_n_puncts        = if_else(text_n_puncts == 0, 0, sel_text_n_puncts / text_n_puncts),
      prop_sel_text_n_prepositions  = if_else(text_n_prepositions == 0, 0, sel_text_n_prepositions / text_n_prepositions),
      cat_sel_text_n_neg            = if_else(sel_text_n_neg == 0, "no", "yes"),
      cat_sel_text_n_question       = if_else(sel_text_n_question == 0, "no", "yes"),
      cat_sel_text_n_digits         = if_else(sel_text_n_digits == 0, "no", "yes"),
      cat_sel_text_n_extraspaces    = if_else(sel_text_n_extraspaces == 0, "no", "yes"),
      cat_sel_text_n_tobe           = if_else(sel_text_n_tobe == 0, "no", "yes"),
      cat_sel_text_n_first_person   = if_else(sel_text_n_first_person == 0, "no", "yes"),
      cat_sel_text_n_second_person  = if_else(sel_text_n_second_person == 0, "no", "yes"),
      cat_sel_text_n_third_person   = if_else(sel_text_n_third_person == 0, "no", "yes"),
      
      # dif_text stats
      dif_text_n_words = map_dbl(dif_text, ~length(str_split(.x, pattern = " ")[[1]])),
      # dif_text_n_lowersp,
      # dif_text_n_capsp,
      # dif_text_n_charsperword,
      # interaction dif_text x text
      sd_dif_text_sent_afinn        = text_sent_afinn - dif_text_sent_afinn,
      sd_dif_text_sent_bing         = text_sent_bing - dif_text_sent_bing,
      sd_dif_text_sent_syuzhet      = text_sent_syuzhet - dif_text_sent_syuzhet,
      sd_dif_text_sent_vader        = text_sent_vader - dif_text_sent_vader,
      sd_dif_text_n_polite          = text_n_polite - dif_text_n_polite,
      prop_dif_text_n_vogals        = if_else(text_n_vogals == 0, 0, dif_text_n_vogals / text_n_vogals),
      prop_dif_text_n_consonants    = if_else(text_n_consonants == 0, 0, dif_text_n_consonants / text_n_consonants),
      prop_dif_text_n_str           = if_else(text_n_str == 0, 0, dif_text_n_str / text_n_str),
      prop_dif_text_len             = dif_text_n_words / text_n_words,
      prop_dif_text_n_chars         = if_else(text_n_chars == 0, 0, dif_text_n_chars / text_n_chars),
      prop_dif_text_n_uq_chars      = if_else(text_n_uq_chars == 0, 0, dif_text_n_uq_chars / text_n_uq_chars),
      prop_dif_text_n_lowers        = if_else(text_n_lowers == 0, 0, dif_text_n_lowers / text_n_lowers),
      prop_dif_text_n_caps          = if_else(text_n_caps == 0, 0, dif_text_n_caps / text_n_caps),
      prop_dif_text_n_periods       = if_else(text_n_periods == 0, 0, dif_text_n_periods / text_n_periods),
      prop_dif_text_n_commas        = if_else(text_n_commas == 0, 0, dif_text_n_commas / text_n_commas),
      prop_dif_text_n_exclaims      = if_else(text_n_exclaims == 0, 0, dif_text_n_exclaims / text_n_exclaims),
      prop_dif_text_n_puncts        = if_else(text_n_puncts == 0, 0, dif_text_n_puncts / text_n_puncts),
      prop_dif_text_n_prepositions  = if_else(text_n_prepositions == 0, 0, dif_text_n_prepositions / text_n_prepositions),
      cat_dif_text_n_neg            = if_else(dif_text_n_neg == 0, "no", "yes"),
      cat_dif_text_n_question       = if_else(dif_text_n_question == 0, "no", "yes"),
      cat_dif_text_n_digits         = if_else(dif_text_n_digits == 0, "no", "yes"),
      cat_dif_text_n_extraspaces    = if_else(dif_text_n_extraspaces == 0, "no", "yes"),
      cat_dif_text_n_tobe           = if_else(dif_text_n_tobe == 0, "no", "yes"),
      cat_dif_text_n_first_person   = if_else(dif_text_n_first_person == 0, "no", "yes"),
      cat_dif_text_n_second_person  = if_else(dif_text_n_second_person == 0, "no", "yes"),
      cat_dif_text_n_third_person   = if_else(dif_text_n_third_person == 0, "no", "yes"),
    )  
    
    # Reorder columns: ids and text first, labels included only for train.
    if(test == F){
        metadata %>%
            select(textID, text,  sel_text, ngram_text, dif_text, sentiment, jaccard,everything())
        
    }else{
        metadata %>%
            select(textID, text, ngram_text, dif_text, sentiment,everything())
    }
    
}

Apply function in train and test data:

# Build the modeling tables. sel_text is the answer, so it must not leak in
# as a predictor — dropping it once in select() is sufficient (the original
# `ttrain$sel_text <- NULL` afterwards was a no-op). TRUE/FALSE spelled out.
ttrain <- parse_metadata(train_metadata, test = FALSE) %>% select(-sel_text)
ttest  <- parse_metadata(test_metadata, test = TRUE)

Remove bad textIDs again (those with no candidate reaching jaccard = 1):

# Keep only tweets that still have at least one perfect (jaccard = 1)
# candidate among their rows.
to_remove <-
  ttrain %>%
  group_by(textID) %>%
  summarise(has_perfect = any(jaccard == 1)) %>%
  filter(!has_perfect) %>%
  pull(textID)
ttrain <- ttrain %>% filter(!textID %in% to_remove)

Sample ngrams of each text that have the same jaccard

# Jaccard distribution per sentiment before down-sampling.
g1 <-
  ggplot(ttrain, aes(x = jaccard, fill = sentiment)) +
  geom_density(alpha = .5) +
  labs(title = "before random sample")

# Share of zero vs non-zero jaccard candidates.
ttrain %>%
  mutate(jaccard = case_when(jaccard == 0 ~ 0,
                             TRUE ~ 1)) %>%
  filter(!is.na(jaccard)) %>%
  count(jaccard) %>%
  mutate(prop = n / sum(n))
FALSE # A tibble: 2 x 3
FALSE   jaccard     n  prop
FALSE     <dbl> <int> <dbl>
FALSE 1       0 25853 0.517
FALSE 2       1 24167 0.483

There are many zeros, let’s sample ngrams:

set.seed(1)

# Down-sample candidates: within each tweet keep one random ngram per
# distinct jaccard value (rounded to 2 decimals), taming the flood of
# jaccard = 0 rows. Kept byte-identical — restyling would change the order
# of RNG draws and therefore the sampled rows.
ttrain <- 
  ttrain %>% 
  group_by(textID) %>% nest() %>% ungroup() %>% 
  mutate(data = map(data, ~.x %>% 
                      mutate(rounded_jaccard = round(jaccard, 2)) %>% 
                      group_by(rounded_jaccard) %>%
                      sample_n(1)%>% 
                      ungroup())) %>% 
  unnest(cols = c(data)) %>%
  select(-rounded_jaccard) 

# Jaccard distribution per sentiment after down-sampling.
g2 <-
  ggplot(ttrain, aes(x = jaccard, fill = sentiment)) +
  geom_density(alpha = .5) +
  labs(title = "after random sample")

# stack the two panels with {patchwork}
g1 / g2

4 Modeling

For machine learning, we will use the tidymodels framework:


Source: https://rviews.rstudio.com/2019/06/19/a-gentle-intro-to-tidymodels/

Pre processing with recipes:

# Pre-processing recipe:
#  - drop identifier/text columns (not predictors)
#  - encode sentiment numerically (+1 positive, -1 negative; neutral rows
#    were removed earlier, so case_when's missing default cannot produce NA)
#  - Yeo-Johnson transform + normalize numeric predictors
#  - dummy-encode the remaining nominal (cat_*) predictors
jaccard_recipe <- recipe(ttrain, jaccard ~ .) %>%
  step_rm(textID, text, ngram_text, dif_text) %>% 
  step_mutate(sentiment = case_when(sentiment == "positive"~1,
                              sentiment == "negative"~-1)) %>% 
  step_YeoJohnson(all_numeric(),-all_outcomes(), -sentiment) %>%
  step_normalize(all_numeric(),-all_outcomes()) %>%
  step_dummy(all_nominal())

Define cross validation with recipes:

set.seed(123)
# 5-fold cross-validation, stratified on the outcome (rsample bins the
# continuous jaccard internally for stratification).
jaccard_vfold <- vfold_cv(ttrain, v = 5, strata = jaccard)

Define model with parsnip:

# XGBoost regression specification; every major hyperparameter is left to be
# tuned: trees/learn_rate (boosting schedule), tree_depth/min_n/loss_reduction
# (model complexity), sample_size/mtry (stochasticity).
jaccard_xgb_model <- 
  boost_tree(
  trees = tune(), 
  learn_rate = tune(), # step size
  tree_depth = tune(), min_n = tune(),
  loss_reduction = tune(), # first three: model complexity
  sample_size = tune(), mtry = tune(), # randomness
  ) %>% 
  set_mode("regression") %>% 
  set_engine("xgboost", nthread = ncores)

Start xgboost workflow:

# Base workflow holding the recipe; the model spec is attached on top.
jaccard_workflow <- workflow() %>% add_recipe(jaccard_recipe)
jaccard_xgb_workflow <-jaccard_workflow %>% add_model(jaccard_xgb_model)

Determine params

# Search space for Bayesian tuning. mtry's upper bound depends on the number
# of predictors, so it is finalized against ttrain.
# NOTE(review): finalize() sees all ttrain columns, including the id/text
# columns the recipe later drops — the mtry upper bound is approximate.
xgb_params <- parameters(
  trees(),
  learn_rate(), # step size
  tree_depth(), min_n(), 
  loss_reduction(), # first three: model complexity
  sample_size = sample_prop(), finalize(mtry(), ttrain)  # randomness
)

# Restrict the tree count to a cheaper range for kernel runtime.
xgb_params <- xgb_params %>% update(trees = trees(c(100, 500))) 

Update the xgboost workflow:

# Full workflow (model + recipe) used for tuning.
# NOTE(review): this duplicates `jaccard_xgb_workflow` created above; one of
# the two objects could be removed.
workflow_jaccard_xgb_model <- 
  workflow() %>% 
  add_model(jaccard_xgb_model) %>% 
  add_recipe(jaccard_recipe)

Iterative Bayesian optimization of a regression model:

# Bayesian hyperparameter search over the 5-fold resamples; stops after 10
# iterations without improvement (30 iterations max), keeping out-of-fold
# predictions for the diagnostic plots below.
set.seed(321)
xgb_tune <-
  workflow_jaccard_xgb_model %>%
  tune_bayes(
    resamples = jaccard_vfold,
    param_info = xgb_params,
    # initial = ?,
    iter = 30, 
    # metrics = metric_set(rmse, mape),
    control = control_bayes(no_improve = 10, 
                            save_pred = T, verbose = T)
  )

# tuning diagnostics: metric trajectory across iterations
autoplot(xgb_tune)

Collect predictions:

# Predicted vs observed jaccard distributions per resample.
# Fixes: `volor` was a typo for `colour` (ggplot2 silently ignores unknown
# aesthetics, so the outline color was simply missing); gather() is
# superseded — use pivot_longer().
collect_predictions(xgb_tune) %>% 
  select(id, .pred, jaccard) %>% 
  pivot_longer(-id, names_to = "key", values_to = "value") %>% 
  ggplot(aes(x = value, colour = key, fill = key)) + 
  geom_density(alpha = .2) + 
  labs(x = "", y = "") + 
  facet_wrap(~id) + 
  theme(legend.position = "bottom")

Select best model:

# Pick the hyperparameter set with the lowest cross-validated RMSE.
# NOTE(review): `maximize` is deprecated in recent {tune}; newer versions use
# select_best(xgb_tune, metric = "rmse") — confirm the installed version.
jaccard_best_model <- select_best(xgb_tune, "rmse", maximize = F)
print(jaccard_best_model)
FALSE # A tibble: 1 x 7
FALSE    mtry trees min_n tree_depth learn_rate loss_reduction sample_size
FALSE   <int> <int> <int>      <int>      <dbl>          <dbl>       <dbl>
FALSE 1    30   124    15         14     0.0981    0.000000205       0.861

Fit final model:

# Plug the best hyperparameters into the spec and refit on all of ttrain.
jaccard_final_model <- finalize_model(jaccard_xgb_model, jaccard_best_model)
jaccard_workflow    <- workflow_jaccard_xgb_model %>% update_model(jaccard_final_model)
jaccard_xgb_fit     <- fit(jaccard_workflow, data = ttrain)

Predict jaccard for all test ngrams:

# Score every candidate ngram of the test set.
pred <- predict(jaccard_xgb_fit, ttest)

# For each tweet keep the candidate with the highest predicted jaccard;
# among tied candidates a single row per (textID, .pred) is retained.
results <-
  ttest %>%
  bind_cols(as_tibble(pred)) %>%
  select(textID, text, ngram_text, .pred) %>%
  group_by(textID) %>%
  filter(.pred == max(.pred)) %>%
  distinct(textID, .pred, .keep_all = TRUE) %>%
  ungroup()

head(results)
FALSE # A tibble: 6 x 4
FALSE   textID    text                                                ngram_text .pred
FALSE   <chr>     <chr>                                               <chr>      <dbl>
FALSE 1 11aa4945… http://twitpic.com/67swx - i wish i was calling yo… wish       0.850
FALSE 2 fd1db57d… i'm done.haha. HOUSE MD marathon ulet               marathon   0.539
FALSE 3 2524332d… I'm concerned for that family                       concerned  0.736
FALSE 4 0fb19285… HEY GUYS IT'S WORKING NO NEED TO WORRY. i have too… WORRY.     1.06 
FALSE 5 311d2b18… Tracy and Berwick breaks my achy breaky heart  The… breaks     0.675
FALSE 6 95dfefd4… Well off 2 bed...cant wait 2 party 4 Mother's Day … like       0.750

5 Conclusion

Prepare to submit!

submission <- read_csv("../data/sample_submission.csv")

# Replace the sample answers: model picks for positive/negative tweets and
# the full tweet text for neutral ones. The join key defaults to "textID"
# (the only shared column).
# NOTE(review): the submission tibble is never written out in this chunk —
# presumably a write_csv(submission, "submission.csv") follows; confirm.
submission <- 
  submission %>%
  select(-selected_text) %>%
  left_join(
    bind_rows(
      select(results, textID, selected_text = ngram_text),
      select(test_neutral, textID, selected_text = text)
    )
  )

6 References

https://www.kaggle.com/nkoprowicz/a-simple-solution-using-only-word-counts https://www.kaggle.com/khoongweihao/feature-engineering-lightgbm-model-starter-kit https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/139803 https://www.kaggle.com/jonathanbesomi/question-answering-starter-pack https://machinelearningmastery.com/gentle-introduction-text-summarization/ https://www.tidymodels.org/learn/work/bayes-opt/